The Thera Bank recently saw a steep decline in the number of users of its credit cards. Credit cards are a good source of income for banks because of the different kinds of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, and foreign transaction fees. Some fees are charged to every user irrespective of usage, while others are charged only under specified circumstances.
Customers leaving the credit card service would lead to a loss for the bank, so the bank wants to analyze its customer data to identify the customers who are likely to leave the credit card service — and the reasons why — so that the bank can improve in those areas.
# this will help in making the Python code more structured automatically (good coding practice)
%load_ext nb_black
# library to suppress warnings or deprecation notes
import warnings
warnings.filterwarnings("ignore")
# libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# library to split data
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# remove the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# set the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
from sklearn import metrics
# library to build Linear Regression Model
from sklearn.linear_model import LogisticRegression
# library to encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# libraries to build decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# libraries to build ensemble models
from sklearn.ensemble import (
BaggingClassifier,
RandomForestClassifier,
AdaBoostClassifier,
GradientBoostingClassifier,
)
# libraries to build xgboost model
from xgboost import XGBClassifier
# library for stacking classifier
from sklearn.ensemble import StackingClassifier
# to tune different models
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
# to get different metric scores
from sklearn.metrics import (
recall_score,
confusion_matrix,
)
# for oversampling and undersampling data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# for missing value imputation
from sklearn.impute import SimpleImputer
# histogram and boxplot for the feature
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Draw a boxplot and a histogram of the same feature on a shared x-axis.

    data: dataframe
    feature: dataframe column name
    figsize: size of figure (default (12, 7))
    kde: whether to show the density curve on the histogram (default False)
    bins: number of bins for the histogram (default None, i.e. automatic)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # boxplot on top, histogram below
        sharex=True,  # both plots share the same x-axis scale
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    # boxplot; showmeans=True adds a marker at the mean value of the column
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="orange"
    )
    # histogram: pass bins only when the caller supplied them.
    # (fix: the original abused a conditional expression for its side effect
    # and passed palette="Winter", which histplot ignores without hue=)
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins)
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    # vertical reference lines: mean (green dashed) and median (blue solid)
    ax_hist2.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist2.axvline(data[feature].median(), color="blue", linestyle="-")
    plt.show()  # show the plot
# labeled_barplot
def labeled_barplot(data, feature, perc=False, v_ticks=True, n=None):
    """
    Barplot with the count or percentage annotated on top of each bar.

    data: dataframe
    feature: dataframe column name
    perc: whether to display percentages instead of counts (default False)
    v_ticks: whether to rotate the x-tick labels vertically (default True)
    n: displays the top n category levels (default None, i.e., display all levels)
    """
    total = len(data[feature])  # number of rows, denominator for percentages
    count = data[feature].nunique()
    # scale the figure width with the number of bars that will be shown
    plt.figure(figsize=((count if n is None else n) + 1, 5))
    if v_ticks:
        plt.xticks(rotation=90)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:
            # percentage of each class of the category
            label = "{:.1f}%".format(100 * p.get_height() / total)
        else:
            # raw count of each level of the category
            label = p.get_height()
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),  # nudge the label slightly above the bar
            textcoords="offset points",
        )  # annotate the count / percentage
    plt.show()  # show the plot
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print crosstabs of predictor vs target and plot a stacked bar chart.

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    # sort categories by the least frequent target class
    sorter = data[target].value_counts().index[-1]
    # absolute counts, with row/column margins
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # row-normalized crosstab so every bar stacks to 100%
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 6))
    # bug fix: the original called plt.legend twice and the first call
    # (loc="lower left", frameon=False) was immediately discarded by the
    # second; only the effective legend call is kept
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
# to plot confusion matrix
def draw_matrix(model, predictors, target):
    """
    Plot the confusion matrix of a fitted classifier with counts and percentages.

    model: fitted classifier exposing .predict
    predictors: independent variables
    target: true labels (dependent variable)
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    total = cm.sum()  # hoisted: was recomputed via cm.flatten().sum() per cell
    # annotate each cell as "count\npercent-of-all-samples"
    labels = np.asarray(
        [
            ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)]
            for item in cm.flatten()
        ]
    ).reshape(cm.shape)  # generalized: works for any class count, not just 2x2
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.show()
# calculate different metric scores of the model - Accuracy, Recall, F1 and Precision
def get_metrics_score(model, flag=True):
    """
    Compute accuracy, recall, precision and F1 of a fitted classifier on the
    training, validation and test sets.

    model : fitted classifier exposing .predict and .score
    flag  : if True (default), also print every metric

    Returns
    -------
    model_df : dataframe with one row per data set and one column per metric
    score_list : flat list grouped metric-by-metric (accuracy, recall,
        precision, F1), each group ordered train, validation, test —
        identical ordering to the original implementation.

    NOTE(review): relies on module-level globals X_train/X_val/X_test and
    y_train/y_val/y_test created by the train/validation/test split cell.
    """
    # refactor: the original repeated the same predict/score code three
    # times per metric; one loop over the splits produces identical values
    splits = (
        ("training", X_train, y_train),
        ("validation", X_val, y_val),
        ("test", X_test, y_test),
    )
    accuracy_list = []
    recall_list = []
    precision_list = []
    f1_list = []
    for _, X_part, y_part in splits:
        pred = model.predict(X_part)
        accuracy_list.append(model.score(X_part, y_part))
        recall_list.append(metrics.recall_score(y_part, pred))
        precision_list.append(metrics.precision_score(y_part, pred))
        f1_list.append(metrics.f1_score(y_part, pred))
    # flat list kept for backward compatibility with existing callers
    score_list = accuracy_list + recall_list + precision_list + f1_list
    # if the flag is True (the default) print every metric per data set
    if flag:
        for metric_name, values in (
            ("Accuracy", accuracy_list),
            ("Recall", recall_list),
            ("Precision", precision_list),
            ("F1", f1_list),
        ):
            for (set_name, _, _), value in zip(splits, values):
                print(f"{metric_name} on {set_name} set : ", value)
    model_df = pd.DataFrame(
        {
            "DataSet": ["Training", "Validation", "Testing"],
            "Accuracy": accuracy_list,
            "Recall": recall_list,
            "Precision": precision_list,
            "F1": f1_list,
        }
    )
    return model_df, score_list  # dataframe and the flat train/val/test scores
# load the customer churn data from the BankChurners CSV file
# (NOTE(review): the original comment mentioned an Excel file with a
# "Tourism" sheet, which does not match the code — the data is read as CSV)
df = pd.read_csv("BankChurners.csv")
# back up data to preserve the initial version for reference
df_back = df.copy()
# print the dataset dimensions (rows, columns)
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")  # f-string
There are 10127 rows and 21 columns.
# check the dataset information
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
Observations:
# setting the random seed via np.random.seed to get the same random results every time
np.random.seed(1)
# also look at random 10 sample rows
df.sample(n=10)
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6498 | 712389108 | Existing Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Blue | 36 | 6 | 3 | 2 | 2570.0 | 2107 | 463.0 | 0.651 | 4058 | 83 | 0.766 | 0.820 |
| 9013 | 718388733 | Existing Customer | 38 | F | 1 | College | NaN | Less than $40K | Blue | 32 | 2 | 3 | 3 | 2609.0 | 1259 | 1350.0 | 0.871 | 8677 | 96 | 0.627 | 0.483 |
| 2053 | 710109633 | Existing Customer | 39 | M | 2 | College | Married | $60K - $80K | Blue | 31 | 6 | 3 | 2 | 9871.0 | 1061 | 8810.0 | 0.545 | 1683 | 34 | 0.478 | 0.107 |
| 3211 | 717331758 | Existing Customer | 44 | M | 4 | Graduate | Married | $120K + | Blue | 32 | 6 | 3 | 4 | 34516.0 | 2517 | 31999.0 | 0.765 | 4228 | 83 | 0.596 | 0.073 |
| 5559 | 709460883 | Attrited Customer | 38 | F | 2 | Doctorate | Married | Less than $40K | Blue | 28 | 5 | 2 | 4 | 1614.0 | 0 | 1614.0 | 0.609 | 2437 | 46 | 0.438 | 0.000 |
| 6106 | 789105183 | Existing Customer | 54 | M | 3 | Post-Graduate | Single | $80K - $120K | Silver | 42 | 3 | 1 | 2 | 34516.0 | 2488 | 32028.0 | 0.552 | 4401 | 87 | 0.776 | 0.072 |
| 4150 | 771342183 | Attrited Customer | 53 | F | 3 | Graduate | Single | $40K - $60K | Blue | 40 | 6 | 3 | 2 | 1625.0 | 0 | 1625.0 | 0.689 | 2314 | 43 | 0.433 | 0.000 |
| 2205 | 708174708 | Existing Customer | 38 | M | 4 | Graduate | Married | $40K - $60K | Blue | 27 | 6 | 2 | 4 | 5535.0 | 1276 | 4259.0 | 0.636 | 1764 | 38 | 0.900 | 0.231 |
| 4145 | 718076733 | Existing Customer | 43 | M | 1 | Graduate | Single | $60K - $80K | Silver | 31 | 4 | 3 | 3 | 25824.0 | 1170 | 24654.0 | 0.684 | 3101 | 73 | 0.780 | 0.045 |
| 5324 | 821889858 | Attrited Customer | 50 | F | 1 | Doctorate | Single | abc | Blue | 46 | 6 | 4 | 3 | 1970.0 | 1477 | 493.0 | 0.662 | 2493 | 44 | 0.571 | 0.750 |
Observations:
# number of missing values in columns
df.isna().sum()
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# percentage of missing values in each column
round(df.isna().sum() / df.isna().count() * 100, 2)
CLIENTNUM 0.0 Attrition_Flag 0.0 Customer_Age 0.0 Gender 0.0 Dependent_count 0.0 Education_Level 15.0 Marital_Status 7.4 Income_Category 0.0 Card_Category 0.0 Months_on_book 0.0 Total_Relationship_Count 0.0 Months_Inactive_12_mon 0.0 Contacts_Count_12_mon 0.0 Credit_Limit 0.0 Total_Revolving_Bal 0.0 Avg_Open_To_Buy 0.0 Total_Amt_Chng_Q4_Q1 0.0 Total_Trans_Amt 0.0 Total_Trans_Ct 0.0 Total_Ct_Chng_Q4_Q1 0.0 Avg_Utilization_Ratio 0.0 dtype: float64
Observations:
# let's check for duplicate values in the data
df.duplicated().sum()
0
Observations:
There are no duplicated values.
# let's view the statistical summary of the numerical columns in the data
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CLIENTNUM | 10127.0 | 7.391776e+08 | 3.690378e+07 | 708082083.0 | 7.130368e+08 | 7.179264e+08 | 7.731435e+08 | 8.283431e+08 |
| Customer_Age | 10127.0 | 4.632596e+01 | 8.016814e+00 | 26.0 | 4.100000e+01 | 4.600000e+01 | 5.200000e+01 | 7.300000e+01 |
| Dependent_count | 10127.0 | 2.346203e+00 | 1.298908e+00 | 0.0 | 1.000000e+00 | 2.000000e+00 | 3.000000e+00 | 5.000000e+00 |
| Months_on_book | 10127.0 | 3.592841e+01 | 7.986416e+00 | 13.0 | 3.100000e+01 | 3.600000e+01 | 4.000000e+01 | 5.600000e+01 |
| Total_Relationship_Count | 10127.0 | 3.812580e+00 | 1.554408e+00 | 1.0 | 3.000000e+00 | 4.000000e+00 | 5.000000e+00 | 6.000000e+00 |
| Months_Inactive_12_mon | 10127.0 | 2.341167e+00 | 1.010622e+00 | 0.0 | 2.000000e+00 | 2.000000e+00 | 3.000000e+00 | 6.000000e+00 |
| Contacts_Count_12_mon | 10127.0 | 2.455317e+00 | 1.106225e+00 | 0.0 | 2.000000e+00 | 2.000000e+00 | 3.000000e+00 | 6.000000e+00 |
| Credit_Limit | 10127.0 | 8.631954e+03 | 9.088777e+03 | 1438.3 | 2.555000e+03 | 4.549000e+03 | 1.106750e+04 | 3.451600e+04 |
| Total_Revolving_Bal | 10127.0 | 1.162814e+03 | 8.149873e+02 | 0.0 | 3.590000e+02 | 1.276000e+03 | 1.784000e+03 | 2.517000e+03 |
| Avg_Open_To_Buy | 10127.0 | 7.469140e+03 | 9.090685e+03 | 3.0 | 1.324500e+03 | 3.474000e+03 | 9.859000e+03 | 3.451600e+04 |
| Total_Amt_Chng_Q4_Q1 | 10127.0 | 7.599407e-01 | 2.192068e-01 | 0.0 | 6.310000e-01 | 7.360000e-01 | 8.590000e-01 | 3.397000e+00 |
| Total_Trans_Amt | 10127.0 | 4.404086e+03 | 3.397129e+03 | 510.0 | 2.155500e+03 | 3.899000e+03 | 4.741000e+03 | 1.848400e+04 |
| Total_Trans_Ct | 10127.0 | 6.485869e+01 | 2.347257e+01 | 10.0 | 4.500000e+01 | 6.700000e+01 | 8.100000e+01 | 1.390000e+02 |
| Total_Ct_Chng_Q4_Q1 | 10127.0 | 7.122224e-01 | 2.380861e-01 | 0.0 | 5.820000e-01 | 7.020000e-01 | 8.180000e-01 | 3.714000e+00 |
| Avg_Utilization_Ratio | 10127.0 | 2.748936e-01 | 2.756915e-01 | 0.0 | 2.300000e-02 | 1.760000e-01 | 5.030000e-01 | 9.990000e-01 |
Observations:
# numeric columns of df
# NOTE(review): despite the original comments, these are DataFrame subsets
# returned by select_dtypes, not Python lists
num_cols = df.select_dtypes(include=["int64", "float64"])
# categorical (object-dtype) columns of df
cat_cols = df.select_dtypes(exclude=["int64", "float64"])
# print the level counts of every categorical column
for i in cat_cols:
    print(df[i].value_counts())
    print("-"*50)
    print("\n")
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64 -------------------------------------------------- F 5358 M 4769 Name: Gender, dtype: int64 -------------------------------------------------- Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 -------------------------------------------------- Married 4687 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64 -------------------------------------------------- Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64 -------------------------------------------------- Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64 --------------------------------------------------
Observations:
Education_Level, Income_Category and Card_Category can be checked for influence on Marital_Status
df[df["Marital_Status"].isna()]["Card_Category"].value_counts()
Blue 683 Silver 52 Gold 12 Platinum 2 Name: Card_Category, dtype: int64
df[df["Card_Category"] == "Blue"]["Marital_Status"].value_counts()
Married 4433 Single 3624 Divorced 696 Name: Marital_Status, dtype: int64
df[df["Marital_Status"].isna()]["Income_Category"].value_counts()
Less than $40K 250 $80K - $120K 136 $40K - $60K 132 $60K - $80K 102 abc 82 $120K + 47 Name: Income_Category, dtype: int64
df[df["Income_Category"] == "Less than $40K"]["Marital_Status"].value_counts()
Married 1628 Single 1429 Divorced 254 Name: Marital_Status, dtype: int64
df[df["Marital_Status"].isna()]["Education_Level"].value_counts()
Graduate 227 High School 154 Uneducated 109 College 74 Post-Graduate 43 Doctorate 28 Name: Education_Level, dtype: int64
Observations:
# impute missing Marital_Status values with the most frequent category
# (fix: np.nan instead of np.NaN — the np.NaN alias was removed in NumPy 2.0)
mar_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
df.Marital_Status = mar_imputer.fit_transform(df["Marital_Status"].values.reshape(-1, 1))[:, 0]
# verify the imputation
df["Marital_Status"].value_counts()
Married 5436 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64
Observations:
NULL Values are treated.
Income_Category can be checked for influence on Education_Level
df[df["Education_Level"].isna()]["Income_Category"].value_counts()
Less than $40K 556 $40K - $60K 269 $80K - $120K 219 $60K - $80K 210 abc 145 $120K + 120 Name: Income_Category, dtype: int64
df[df["Income_Category"] == "Less than $40K"]["Education_Level"].value_counts()
Graduate 1139 High School 671 Uneducated 522 College 345 Post-Graduate 170 Doctorate 158 Name: Education_Level, dtype: int64
Observations:
# impute missing Education_Level values with the most frequent category
# (fix: np.nan instead of np.NaN — the np.NaN alias was removed in NumPy 2.0)
edu_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
df.Education_Level = edu_imputer.fit_transform(df["Education_Level"].values.reshape(-1, 1))[:, 0]
# verify the imputation
df["Education_Level"].value_counts()
Graduate 4647 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64
Observations:
NULL Values are treated.
# precentage of missing values in columns
round(df.isna().sum() / df.isna().count() * 100, 2)
CLIENTNUM 0.0 Attrition_Flag 0.0 Customer_Age 0.0 Gender 0.0 Dependent_count 0.0 Education_Level 0.0 Marital_Status 0.0 Income_Category 0.0 Card_Category 0.0 Months_on_book 0.0 Total_Relationship_Count 0.0 Months_Inactive_12_mon 0.0 Contacts_Count_12_mon 0.0 Credit_Limit 0.0 Total_Revolving_Bal 0.0 Avg_Open_To_Buy 0.0 Total_Amt_Chng_Q4_Q1 0.0 Total_Trans_Amt 0.0 Total_Trans_Ct 0.0 Total_Ct_Chng_Q4_Q1 0.0 Avg_Utilization_Ratio 0.0 dtype: float64
Observations:
We see that all null values are now treated
# check value_counts
df.Income_Category.value_counts()
Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64
# check Eduation_Level values for income_category abc
df[df["Income_Category"] == "abc"]["Education_Level"].value_counts()
Graduate 477 High School 225 Uneducated 185 College 108 Doctorate 70 Post-Graduate 47 Name: Education_Level, dtype: int64
Observations:
# treating error
ic_imputer = SimpleImputer(missing_values="abc", strategy="most_frequent")
df.Income_Category = ic_imputer.fit_transform(df["Income_Category"].values.reshape(-1,1))[:,0]
# verify the update
df.Income_Category.value_counts()
Less than $40K 4673 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 $120K + 727 Name: Income_Category, dtype: int64
Observations:
Income_Category "abc" is now treated and imputed with most_frequent value "Less than $40k"
# values before encoding
df.Attrition_Flag.value_counts()
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64
# instantiate the LabelEncoder
encode_attr = LabelEncoder()
# fit the encoder
encode_attr.fit(df["Attrition_Flag"])
# encode the variable
df["Attrition_Flag"] = encode_attr.transform(df["Attrition_Flag"])
# verify the update
df.Attrition_Flag.value_counts()
1 8500 0 1627 Name: Attrition_Flag, dtype: int64
Observations:
The target column is now encoded, with Existing Customer as 1 and Attrited Customer as 0.
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"CLIENTNUM")
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Attrition_Flag", True, False)
Observations:
Target variable shows 83.9% customers are existing and 16.1% are attrited.
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Customer_Age")
df.loc[df["Customer_Age"] > 70]
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 251 | 715952883 | 1 | 73 | M | 0 | High School | Married | $40K - $60K | Blue | 36 | 5 | 3 | 2 | 4469.0 | 1125 | 3344.0 | 1.363 | 1765 | 34 | 1.615 | 0.252 |
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Gender", True, False)
Observations:
Number of Female customers is higher than Male customers.
# use label_barplot function to plot the graph
labeled_barplot(df,"Dependent_count", True, False)
Observations:
Most customers have 3 dependents followed by customers with 2 and 1 dependents.
# use label_barplot function to plot the graph
labeled_barplot(df,"Education_Level", True, True)
Observations:
Most customers are Graduate followed by High School.
# use label_barplot function to plot the graph
labeled_barplot(df,"Marital_Status", True, False)
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Income_Category", True, True)
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Card_Category", True, False)
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Months_on_book")
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Total_Relationship_Count", True, False)
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Months_Inactive_12_mon", True, False)
Observations:
# use label_barplot function to plot the graph
labeled_barplot(df,"Contacts_Count_12_mon", True, False)
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Credit_Limit")
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Total_Revolving_Bal")
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Avg_Open_To_Buy")
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Total_Amt_Chng_Q4_Q1")
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Total_Trans_Amt")
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Total_Trans_Ct")
df.loc[df["Total_Trans_Ct"] > 130]
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9213 | 719413383 | 1 | 32 | M | 1 | Uneducated | Single | $60K - $80K | Silver | 36 | 2 | 3 | 1 | 33711.0 | 1437 | 32274.0 | 0.942 | 14880 | 134 | 0.654 | 0.043 |
| 9261 | 716430933 | 1 | 46 | F | 4 | Graduate | Single | Less than $40K | Blue | 36 | 2 | 1 | 2 | 3973.0 | 0 | 3973.0 | 0.890 | 13740 | 131 | 0.819 | 0.000 |
| 9269 | 711010683 | 1 | 35 | M | 3 | High School | Married | $40K - $60K | Blue | 27 | 1 | 2 | 3 | 14382.0 | 1950 | 12432.0 | 0.578 | 12941 | 131 | 0.819 | 0.136 |
| 9324 | 708163758 | 1 | 41 | M | 3 | Graduate | Married | $120K + | Blue | 33 | 2 | 4 | 3 | 34516.0 | 638 | 33878.0 | 0.724 | 13085 | 139 | 0.675 | 0.018 |
| 9339 | 715728108 | 1 | 27 | F | 0 | Graduate | Married | Less than $40K | Blue | 36 | 1 | 1 | 2 | 4548.0 | 1450 | 3098.0 | 0.844 | 14330 | 131 | 0.638 | 0.319 |
| 9586 | 784868958 | 1 | 56 | F | 1 | High School | Married | Less than $40K | Blue | 49 | 1 | 2 | 1 | 17542.0 | 2517 | 15025.0 | 0.800 | 13939 | 138 | 0.792 | 0.143 |
| 9629 | 709015833 | 1 | 42 | M | 2 | Graduate | Single | $60K - $80K | Silver | 36 | 3 | 3 | 2 | 34516.0 | 0 | 34516.0 | 0.774 | 12920 | 132 | 0.737 | 0.000 |
| 9728 | 710699283 | 1 | 46 | M | 2 | Graduate | Single | $120K + | Blue | 28 | 1 | 1 | 1 | 7790.0 | 1921 | 5869.0 | 0.789 | 14567 | 131 | 0.617 | 0.247 |
| 9841 | 715601808 | 1 | 50 | M | 3 | Graduate | Married | $120K + | Blue | 41 | 4 | 2 | 3 | 34516.0 | 2253 | 32263.0 | 1.032 | 16692 | 131 | 0.795 | 0.065 |
| 10085 | 717714633 | 1 | 49 | M | 3 | Uneducated | Married | $120K + | Gold | 38 | 4 | 3 | 4 | 14938.0 | 0 | 14938.0 | 0.737 | 15277 | 131 | 0.724 | 0.000 |
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Total_Ct_Chng_Q4_Q1")
Observations:
# use the defined function histogram_boxplot to plot the graphs
histogram_boxplot(df,"Avg_Utilization_Ratio")
Observations:
# boxplots of every numeric variable split by the target class
plt.figure(figsize=(20, 30))
for i, variable in enumerate(num_cols):
    # 6x3 grid: the original 5x3 (15 cells) overflows when num_cols holds
    # 16 columns (as in the outlier summary below)
    plt.subplot(6, 3, i + 1)
    # keyword arguments: positional x/y for sns.boxplot were deprecated in
    # seaborn 0.12 and removed in 0.13
    sns.boxplot(x=df["Attrition_Flag"], y=df[variable], palette="Set1")
    plt.tight_layout()
    plt.title(variable)
plt.show()
Observations:
# use the defined function stacked_barplot to plot the graphs
stacked_barplot(df, "Gender", "Attrition_Flag")
Attrition_Flag 0 1 All Gender All 1627 8500 10127 F 930 4428 5358 M 697 4072 4769 ------------------------------------------------------------------------------------------------------------------------
Observations:
The number of male customers closing their credit card account is slightly lower than that of female customers.
# use the defined function stacked_barplot to plot the graphs
stacked_barplot(df, "Education_Level", "Attrition_Flag")
Attrition_Flag 0 1 All Education_Level All 1627 8500 10127 Graduate 743 3904 4647 High School 306 1707 2013 Uneducated 237 1250 1487 College 154 859 1013 Doctorate 95 356 451 Post-Graduate 92 424 516 ------------------------------------------------------------------------------------------------------------------------
Observations:
# use the defined function stacked_barplot to plot the graphs
stacked_barplot(df, "Marital_Status", "Attrition_Flag")
Attrition_Flag 0 1 All Marital_Status All 1627 8500 10127 Married 838 4598 5436 Single 668 3275 3943 Divorced 121 627 748 ------------------------------------------------------------------------------------------------------------------------
Observations:
# use the defined function stacked_barplot to plot the graphs
stacked_barplot(df, "Income_Category", "Attrition_Flag")
Attrition_Flag 0 1 All Income_Category All 1627 8500 10127 Less than $40K 799 3874 4673 $40K - $60K 271 1519 1790 $80K - $120K 242 1293 1535 $60K - $80K 189 1213 1402 $120K + 126 601 727 ------------------------------------------------------------------------------------------------------------------------
Observations:
# use the defined function stacked_barplot to plot the graphs
stacked_barplot(df, "Card_Category", "Attrition_Flag")
Attrition_Flag 0 1 All Card_Category All 1627 8500 10127 Blue 1519 7917 9436 Silver 82 473 555 Gold 21 95 116 Platinum 5 15 20 ------------------------------------------------------------------------------------------------------------------------
Observations:
Platinum card members are most likely to close the credit card compared to other categories.
# heatmap of pairwise correlations between the numeric columns
plt.figure(figsize=(15, 7))
# numeric_only=True: since pandas 2.0, DataFrame.corr raises on object columns
sns.heatmap(
    df.corr(numeric_only=True), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral"
)
plt.show()
Observations:
# heatmap for correlation
sns.pairplot(df,hue="Attrition_Flag")
<seaborn.axisgrid.PairGrid at 0x21704bbcca0>
Observations:
As we have seen during bivariate analysis there are no clear indicators of attrition as all variables show similar data for both existing customers and attrited customers.
# visualize outliers in each numeric variable before treating them
plt.figure(figsize=(20, 30))
for i, variable in enumerate(num_cols):
    plt.subplot(5, 4, i + 1)  # 20 cells, enough for all numeric columns
    plt.boxplot(df[variable], whis=1.5)
    plt.tight_layout()
    plt.title(variable)
plt.show()
# 25th / 75th percentiles of the numeric columns only
# (fix: quantile on the full mixed-dtype frame raises a TypeError in
# pandas >= 2.0; the comparison below uses num_cols anyway, so compute
# the quantiles on num_cols directly)
Q1 = num_cols.quantile(0.25)
Q3 = num_cols.quantile(0.75)
# inter-quartile range (75th percentile - 25th percentile)
IQR = Q3 - Q1
# Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are outliers
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
# percentage of outlier values per numeric column
((num_cols < lower) | (num_cols > upper)).sum() / len(df) * 100
Attrition_Flag 0.000000 Avg_Open_To_Buy 9.509233 Avg_Utilization_Ratio 0.000000 CLIENTNUM 0.000000 Contacts_Count_12_mon 6.211119 Credit_Limit 9.716599 Customer_Age 0.019749 Dependent_count 0.000000 Months_Inactive_12_mon 3.268490 Months_on_book 3.811593 Total_Amt_Chng_Q4_Q1 3.900464 Total_Ct_Chng_Q4_Q1 3.910339 Total_Relationship_Count 0.000000 Total_Revolving_Bal 0.000000 Total_Trans_Amt 8.847635 Total_Trans_Ct 0.019749 dtype: float64
Observations:
# separate the target variable from the predictors
# (earlier experiments that also dropped CLIENTNUM and the change-ratio /
# utilization columns are kept below, commented out, for reference)
#X = df.drop(["CLIENTNUM","Attrition_Flag","Total_Ct_Chng_Q4_Q1","Total_Amt_Chng_Q4_Q1","Avg_Utilization_Ratio"], axis=1)
#X = df.drop(["CLIENTNUM","Attrition_Flag"], axis=1)
X = df.drop(["Attrition_Flag"], axis=1)
y = df["Attrition_Flag"]
# one-hot encode the categorical predictors, dropping the first level of each
X = pd.get_dummies(data=X, drop_first=True)
Training Set to have 60% data and Validation and Testing sets to have 20% data each
# Splitting data into training, validation and test set:
# first we split data into 2 parts, say temporary (80%) and test (20%);
# stratify keeps the class ratio identical in every split
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# then we split the temporary set into train and validation
# (0.25 of 80% = 20% of the whole data, giving a 60/20/20 split)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 30) (2026, 30) (2026, 30)
# use SMOTE - Synthetic Minority Over Sampling Technique to create oversampled training sets
# (SMOTE and RandomUnderSampler come from imblearn, presumably imported earlier — verify)
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # sampling_strategy=1 -> resample until classes are balanced 1:1
# fit the sampler and create oversampled data
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
# use randomundersampler for dataset processing
rus = RandomUnderSampler(random_state=1)
# fit the sampler and create undersampled data
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)
# Empty list to store all the models
models = []
# Appending (name, estimator) pairs; random_state fixed for reproducibility
models.append(("Logistic regression", LogisticRegression(random_state=1)))
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("Gradient Boosting", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("Decision tree", DecisionTreeClassifier(random_state=1)))
# Empty list to store all models' CV scores
results = []
# mean CV recall (rounded) per model, for the later comparison table
best_scores = []
# Empty list to store name of the models
names = []
# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation Performance on Training Set:" "\n")
for name, model in models:
    scoring = "recall"  # recall of the positive class is the tuning metric
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
    best_scores.append(round(cv_result.mean() * 100,2))
print("\n" "Training Set Performance:" "\n")
# fit each model on the full training set and report training recall
# (recall_score is presumably imported earlier in the notebook — verify)
for name, model in models:
    model.fit(X_train, y_train)
    scores = recall_score(y_train, model.predict(X_train)) * 100
    print("{}: {}".format(name, scores))
print("\n" "Validation Set Performance:" "\n")
# score the already-fitted models on the held-out validation set
for name, model in models:
    scores = recall_score(y_val, model.predict(X_val)) * 100
    print("{}: {}".format(name, scores))
Cross-Validation Performance on Training Set: Logistic regression: 100.0 Bagging: 97.33314091092767 Random forest: 98.58810059843368 Gradient Boosting: 98.86261040235524 Adaboost: 97.8427716523312 Xgboost: 98.60757374588698 Decision tree: 95.97979564740517 Training Set Performance: Logistic regression: 100.0 Bagging: 99.76465973720337 Random forest: 100.0 Gradient Boosting: 99.27436752304374 Adaboost: 98.41145322612277 Xgboost: 100.0 Decision tree: 100.0 Validation Set Performance: Logistic regression: 100.0 Bagging: 97.76470588235294 Random forest: 98.47058823529412 Gradient Boosting: 98.82352941176471 Adaboost: 97.88235294117648 Xgboost: 99.11764705882354 Decision tree: 96.70588235294117
# Compare the CV recall distributions of all candidate models side by side.
comparison_fig = plt.figure(figsize=(15, 7))
comparison_fig.suptitle("Algorithm Comparison")
axis = comparison_fig.add_subplot(111)
axis.boxplot(results)
axis.set_xticklabels(names)
plt.show()
Observations:
# Empty list to store all models' CV scores on the oversampled data
results_over = []
# mean CV recall (rounded) per model, for the later comparison table
best_scores_over = []
# Empty list to store name of the models
names_over = []
# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation Performance on Oversampled Training Set:" "\n")
for name, model in models:
    scoring = "recall"
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
    )
    results_over.append(cv_result)
    names_over.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
    best_scores_over.append(round(cv_result.mean() * 100,2))
print("\n" "Oversampled Training Performance:" "\n")
# refit each model on the SMOTE-oversampled training set and report recall
for name, model in models:
    model.fit(X_train_over, y_train_over)
    scores_over = recall_score(y_train_over, model.predict(X_train_over)) * 100
    print("{}: {}".format(name, scores_over))
print("\n" "Validation Set Performance:" "\n")
# score the oversample-fitted models on the (untouched) validation set
for name, model in models:
    scores = recall_score(y_val, model.predict(X_val)) * 100
    print("{}: {}".format(name, scores))
Cross-Validation Performance on Oversampled Training Set: Logistic regression: 100.0 Bagging: 93.54790355789028 Random forest: 96.15628547788106 Gradient Boosting: 96.70532432796476 Adaboost: 95.4697800611903 Xgboost: 97.58786969154689 Decision tree: 92.90072928091749 Oversampled Training Performance: Logistic regression: 100.0 Bagging: 99.4508727201412 Random forest: 100.0 Gradient Boosting: 97.37203373210434 Adaboost: 95.82271033535987 Xgboost: 100.0 Decision tree: 100.0 Validation Set Performance: Logistic regression: 100.0 Bagging: 95.17647058823529 Random forest: 96.88235294117648 Gradient Boosting: 96.76470588235294 Adaboost: 95.47058823529412 Xgboost: 98.11764705882354 Decision tree: 93.82352941176471
# Box plots of CV recall per model, computed on the oversampled training data.
comparison_fig = plt.figure(figsize=(15, 7))
comparison_fig.suptitle("Algorithm Comparison")
axis = comparison_fig.add_subplot(111)
axis.boxplot(results_over)
axis.set_xticklabels(names_over)
plt.show()
Observations:
# Empty list to store all models' CV scores on the undersampled data
results_under = []
# mean CV recall (rounded) per model, for the later comparison table
best_scores_under = []
# Empty list to store name of the models
names_under = []
# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation Performance on Undersampled Training set:" "\n")
for name, model in models:
    scoring = "recall"
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train_under, y=y_train_under, scoring=scoring, cv=kfold
    )
    results_under.append(cv_result)
    names_under.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
    best_scores_under.append(round(cv_result.mean() * 100,2))
print("\n" "Undersampled Training Performance:" "\n")
# refit each model on the randomly undersampled training set and report recall
for name, model in models:
    model.fit(X_train_under, y_train_under)
    scores_under = recall_score(y_train_under, model.predict(X_train_under)) * 100
    print("{}: {}".format(name, scores_under))
print("\n" "Validation Set Performance:" "\n")
# score the undersample-fitted models on the (untouched) validation set
for name, model in models:
    scores = recall_score(y_val, model.predict(X_val)) * 100
    print("{}: {}".format(name, scores))
Cross-Validation Performance on Undersampled Training set: Logistic regression: 71.96232339089482 Bagging: 89.95604395604396 Random forest: 91.49450549450549 Gradient Boosting: 93.33961276818418 Adaboost: 91.18942961800104 Xgboost: 93.95447409733123 Decision tree: 87.80847723704866 Undersampled Training Performance: Logistic regression: 62.090163934426236 Bagging: 99.38524590163934 Random forest: 100.0 Gradient Boosting: 97.02868852459017 Adaboost: 94.4672131147541 Xgboost: 100.0 Decision tree: 100.0 Validation Set Performance: Logistic regression: 63.0 Bagging: 91.64705882352942 Random forest: 94.35294117647058 Gradient Boosting: 95.11764705882354 Adaboost: 92.82352941176471 Xgboost: 95.94117647058825 Decision tree: 91.64705882352942
# Box plots of CV recall per model, computed on the undersampled training data.
comparison_fig = plt.figure(figsize=(15, 7))
comparison_fig.suptitle("Algorithm Comparison")
axis = comparison_fig.add_subplot(111)
axis.boxplot(results_under)
axis.set_xticklabels(names_under)
plt.show()
Observations:
# create a dataframe comparing mean CV recall of every model across the
# original, oversampled and undersampled training sets
# NOTE: the "Model" list must stay in the same order models were appended above
comparison_frame = pd.DataFrame({"Model":["Logistic regression","Bagging","Random forest","Gradient Boosting","Adaboost",
                                          "Xgboost","Decision tree"],
                                 "Training Set": best_scores, "Oversampled Training": best_scores_over,
                                 "Undersampled Training": best_scores_under})
comparison_frame
| Model | Training Set | Oversampled Training | Undersampled Training | |
|---|---|---|---|---|
| 0 | Logistic regression | 100.00 | 100.00 | 71.96 |
| 1 | Bagging | 97.33 | 93.55 | 89.96 |
| 2 | Random forest | 98.59 | 96.16 | 91.49 |
| 3 | Gradient Boosting | 98.86 | 96.71 | 93.34 |
| 4 | Adaboost | 97.84 | 95.47 | 91.19 |
| 5 | Xgboost | 98.61 | 97.59 | 93.95 |
| 6 | Decision tree | 95.98 | 92.90 | 87.81 |
Observations:
# random forest classifier
rfc = RandomForestClassifier(random_state=1)
# type of scoring used to compare parameter combinations (recall of class 1)
scorer = metrics.make_scorer(metrics.recall_score)
# grid of parameters to choose from
# NOTE(fix): the original used np.arange(0.3, 0.7, None); step=None means
# step=1, so the grid contained only [0.3] and max_samples was never actually
# tuned. A 0.1 step restores the intended 0.3-0.6 search range.
param_grid_rfc = {
    "n_estimators": np.arange(50, 150, 50),
    "min_samples_leaf": np.arange(1, 6, 1),
    "max_features": ["log2", 0.7, 0.9, "auto"],
    "max_samples": np.arange(0.3, 0.7, 0.1),
    "max_depth": np.arange(1, 5, 1),
}
%%time
# run the grid search over param_grid_rfc, maximizing recall with 5-fold CV
# (GridSearchCV is presumably imported earlier in the notebook — verify)
rfc_tuned1 = GridSearchCV(estimator=rfc, param_grid=param_grid_rfc, scoring=scorer, cv=5, n_jobs = -1, verbose= 2)
# Fitting parameters in GridSearchCV
rfc_tuned1.fit(X_train, y_train)
# print best parameters
print("Best parameters are {} with CV score={}:" .format(rfc_tuned1.best_params_,rfc_tuned1.best_score_))
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Best parameters are {'max_depth': 1, 'max_features': 'log2', 'max_samples': 0.3, 'min_samples_leaf': 1, 'n_estimators': 50} with CV score=1.0:
Wall time: 1min 24s
# Rebuild the random forest with the best hyperparameters found by the grid
# search above, then train it on the full training split.
best_rf_params = dict(
    n_estimators=50,
    max_depth=1,
    max_features="log2",
    max_samples=0.3,
    min_samples_leaf=1,
)
rfc_tuned_gcv = RandomForestClassifier(random_state=1, **best_rf_params)
rfc_tuned_gcv.fit(X_train, y_train)
RandomForestClassifier(max_depth=1, max_features='log2', max_samples=0.3,
n_estimators=50, random_state=1)
# calculating different metrics (accuracy/recall/precision/F1 on all splits)
# via the get_metrics_score helper defined earlier in the notebook
rfc_tuned_gcv_score, rfc_tuned_gcv_list = get_metrics_score(
    rfc_tuned_gcv, False
)
# model performance
rfc_tuned_gcv_score
| DataSet | Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|---|
| 0 | Training | 0.839342 | 1.0 | 0.839342 | 0.912654 |
| 1 | Validation | 0.839092 | 1.0 | 0.839092 | 0.912507 |
| 2 | Testing | 0.839585 | 1.0 | 0.839585 | 0.912798 |
# confusion matrix on the validation set (draw_matrix helper defined earlier)
draw_matrix(rfc_tuned_gcv, X_val, y_val)
Observations:
%%time
# Calling RandomizedSearchCV: samples 50 of the grid's combinations instead of
# trying them all, trading exhaustiveness for speed
rfc_tuned2 = RandomizedSearchCV(estimator=rfc, param_distributions=param_grid_rfc, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
# Fitting parameters in RandomizedSearchCV
rfc_tuned2.fit(X_train, y_train)
# print best parameters
print("Best parameters are {} with CV score={}:" .format(rfc_tuned2.best_params_,rfc_tuned2.best_score_))
Best parameters are {'n_estimators': 100, 'min_samples_leaf': 5, 'max_samples': 0.3, 'max_features': 0.9, 'max_depth': 1} with CV score=1.0:
Wall time: 19 s
# building model with best parameters from the randomized search
rfc_tuned_rcv = RandomForestClassifier(
    random_state=1,
    max_features=0.9,
    max_samples=0.3,
    min_samples_leaf=5,
    n_estimators=100,
    max_depth=1,
)
# Fit the model on training data
rfc_tuned_rcv.fit(X_train, y_train)
RandomForestClassifier(max_depth=1, max_features=0.9, max_samples=0.3,
min_samples_leaf=5, random_state=1)
# calculating different metrics with the get_metrics_score helper
rfc_tuned_rcv_score, rfc_tuned_rcv_list = get_metrics_score(
    rfc_tuned_rcv, False
)
# model performance
rfc_tuned_rcv_score
| DataSet | Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|---|
| 0 | Training | 0.839342 | 1.0 | 0.839342 | 0.912654 |
| 1 | Validation | 0.839092 | 1.0 | 0.839092 | 0.912507 |
| 2 | Testing | 0.839585 | 1.0 | 0.839585 | 0.912798 |
# confusion matrix on the validation set
draw_matrix(rfc_tuned_rcv, X_val, y_val)
Observations:
# defining the XGBoost model to tune
xgb = XGBClassifier(random_state=1,eval_metric="logloss")
# Parameter grid (2*3*4*4*3*4*2 = 2304 combinations)
param_grid_xgb ={"n_estimators":np.arange(50,150,50),
                 "scale_pos_weight":[2,5,10],  # up-weights the positive class
                 "learning_rate":[0.01,0.1,0.2,0.05],
                 "gamma":[0,1,3,5],
                 "subsample":[0.8,0.9,1],
                 "max_depth":np.arange(1,5,1),
                 "reg_lambda":[5,10]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
%%time
# Calling GridSearchCV: exhaustive search over all 2304 combinations
xgb_tuned1 = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, scoring=scorer, cv=5, n_jobs = -1, verbose= 2)
# Fitting parameters in GridSearchCV
xgb_tuned1.fit(X_train,y_train)
# print best parameters
print("Best parameters are {} with CV score={}:" .format(xgb_tuned1.best_params_,xgb_tuned1.best_score_))
Fitting 5 folds for each of 2304 candidates, totalling 11520 fits
Best parameters are {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50, 'reg_lambda': 5, 'scale_pos_weight': 2, 'subsample': 0.8} with CV score=1.0:
Wall time: 36min 52s
# building model with best parameters from the XGBoost grid search
xgb_tuned_gcv = XGBClassifier(
    random_state=1,
    n_estimators=50,
    scale_pos_weight=2,
    subsample=0.8,
    learning_rate=0.01,
    gamma=0,
    eval_metric="logloss",
    reg_lambda=5,
    max_depth=1,
)
# Fit the model on training data
xgb_tuned_gcv.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
eval_metric='logloss', gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.01, max_delta_step=0,
max_depth=1, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=50, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=1,
reg_alpha=0, reg_lambda=5, scale_pos_weight=2, subsample=0.8,
tree_method='exact', validate_parameters=1, verbosity=None)
# calculating different metrics with the get_metrics_score helper
xgb_tuned_gcv_score, xgb_tuned_gcv_list = get_metrics_score(
    xgb_tuned_gcv, False
)
# model performance
xgb_tuned_gcv_score
| DataSet | Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|---|
| 0 | Training | 0.839342 | 1.0 | 0.839342 | 0.912654 |
| 1 | Validation | 0.839092 | 1.0 | 0.839092 | 0.912507 |
| 2 | Testing | 0.839585 | 1.0 | 0.839585 | 0.912798 |
# confusion matrix on the validation set
draw_matrix(xgb_tuned_gcv, X_val, y_val)
Observations:
%%time
# Calling RandomizedSearchCV: samples 50 combinations from param_grid_xgb
xgb_tuned2 = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid_xgb, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
# Fitting parameters
xgb_tuned2.fit(X_train,y_train)
# print best parameters
print("Best parameters are {} with CV score={}:" .format(xgb_tuned2.best_params_,xgb_tuned2.best_score_))
Best parameters are {'subsample': 0.8, 'scale_pos_weight': 2, 'reg_lambda': 10, 'n_estimators': 100, 'max_depth': 1, 'learning_rate': 0.01, 'gamma': 5} with CV score=1.0:
Wall time: 51.2 s
# building model with best parameters from the XGBoost randomized search
xgb_tuned_rcv = XGBClassifier(
    random_state=1,
    n_estimators=100,
    scale_pos_weight=2,
    subsample=0.8,
    learning_rate=0.01,
    gamma=5,
    eval_metric="logloss",
    reg_lambda=10,
    max_depth=1,
)
# Fit the model on training data
xgb_tuned_rcv.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
eval_metric='logloss', gamma=5, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.01, max_delta_step=0,
max_depth=1, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=1,
reg_alpha=0, reg_lambda=10, scale_pos_weight=2, subsample=0.8,
tree_method='exact', validate_parameters=1, verbosity=None)
# calculating different metrics with the get_metrics_score helper
xgb_tuned_rcv_score, xgb_tuned_rcv_list = get_metrics_score(
    xgb_tuned_rcv, False
)
# model performance
xgb_tuned_rcv_score
| DataSet | Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|---|
| 0 | Training | 0.839342 | 1.0 | 0.839342 | 0.912654 |
| 1 | Validation | 0.839092 | 1.0 | 0.839092 | 0.912507 |
| 2 | Testing | 0.839585 | 1.0 | 0.839585 | 0.912798 |
# confusion matrix on the validation set
draw_matrix(xgb_tuned_rcv, X_val, y_val)
Observations:
# Choose the type of classifier.
gbc = GradientBoostingClassifier(random_state=1)
# Parameter grid
# NOTE(fix): the original dict listed the "max_depth" key twice; Python
# silently keeps only the last occurrence, so the duplicate is removed here.
param_grid_gbc = {
    "n_estimators": np.arange(50, 150, 50),
    "learning_rate": [0.01, 0.1, 0.2, 0.05],
    "subsample": [0.8, 0.9, 1],
    "max_depth": np.arange(1, 5, 1),
    "min_samples_leaf": np.arange(1, 6, 1),
    "max_features": ["log2", 0.7, 0.9, "auto"],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
%%time
# Run the grid search over the gradient boosting parameter grid
gbc_tuned1 = GridSearchCV(estimator=gbc, param_grid=param_grid_gbc, scoring=scorer, cv=5, n_jobs = -1, verbose= 2)
# fit the model
gbc_tuned1.fit(X_train, y_train)
# print best parameters
print("Best parameters are {} with CV score={}:" .format(gbc_tuned1.best_params_,gbc_tuned1.best_score_))
Fitting 5 folds for each of 1920 candidates, totalling 9600 fits
Best parameters are {'learning_rate': 0.01, 'max_depth': 1, 'max_features': 'log2', 'min_samples_leaf': 1, 'n_estimators': 50, 'subsample': 0.8} with CV score=1.0:
Wall time: 30min 43s
# Build the gradient boosting model with the grid search's best parameters.
gbc_tuned_gcv = GradientBoostingClassifier (
    random_state=1,
    n_estimators=50,
    learning_rate=0.01,
    subsample=0.8,
    max_depth=1,
    min_samples_leaf=1,
    max_features="log2",
)
# Fit the model on training data
gbc_tuned_gcv.fit(X_train, y_train)
GradientBoostingClassifier(learning_rate=0.01, max_depth=1, max_features='log2',
n_estimators=50, random_state=1, subsample=0.8)
# calculating different metrics with the get_metrics_score helper
gbc_tuned_gcv_score, gbc_tuned_gcv_list = get_metrics_score(
    gbc_tuned_gcv, False
)
# model performance
gbc_tuned_gcv_score
| DataSet | Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|---|
| 0 | Training | 0.839342 | 1.0 | 0.839342 | 0.912654 |
| 1 | Validation | 0.839092 | 1.0 | 0.839092 | 0.912507 |
| 2 | Testing | 0.839585 | 1.0 | 0.839585 | 0.912798 |
# confusion matrix on the validation set
draw_matrix(gbc_tuned_gcv, X_val, y_val)
Observations:
%%time
# Calling RandomizedSearchCV
gbc_tuned2 = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid_gbc, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
# Fitting parameters
gbc_tuned2.fit(X_train,y_train)
# print best parameters
print("Best parameters are {} with CV score={}:" .format(gbc_tuned2.best_params_,gbc_tuned2.best_score_))
[23:23:33] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.0/src/learner.cc:576:
Parameters: { "max_features", "min_samples_leaf" } might not be used.
This could be a false alarm, with some parameters getting used by language bindings but
then being mistakenly passed down to XGBoost core, or some parameter actually being used
but getting flagged wrongly here. Please open an issue if you find any such cases.
Best parameters are {'subsample': 0.8, 'n_estimators': 50, 'min_samples_leaf': 4, 'max_features': 0.7, 'max_depth': 1, 'learning_rate': 0.01} with CV score=1.0:
Wall time: 52.2 s
# Build the gradient boosting model with the randomized search's best parameters.
gbc_tuned_rcv = GradientBoostingClassifier (
    random_state=1,
    n_estimators=50,
    learning_rate=0.01,
    subsample=0.8,
    max_depth=1,
    min_samples_leaf=4,
    max_features=0.7,
)
# Fit the model on training data
gbc_tuned_rcv.fit(X_train, y_train)
GradientBoostingClassifier(learning_rate=0.01, max_depth=1, max_features=0.7,
min_samples_leaf=4, n_estimators=50, random_state=1,
subsample=0.8)
# calculating different metrics with the get_metrics_score helper
gbc_tuned_rcv_score, gbc_tuned_rcv_list = get_metrics_score(
    gbc_tuned_rcv, False
)
# model performance
gbc_tuned_rcv_score
| DataSet | Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|---|
| 0 | Training | 0.839342 | 1.0 | 0.839342 | 0.912654 |
| 1 | Validation | 0.839092 | 1.0 | 0.839092 | 0.912507 |
| 2 | Testing | 0.839585 | 1.0 | 0.839585 | 0.912798 |
# confusion matrix on the validation set
draw_matrix(gbc_tuned_rcv, X_val, y_val)
Observations:
# create a dataframe comparing every tuned model's metrics across all splits;
# each *_list holds 12 values in the row order given by "Model" below
comparison_frame1 = pd.DataFrame({"Model":["Accuracy - Training",
                                           "Accuracy - Validation",
                                           "Accuracy - Test",
                                           "Recall - Training",
                                           "Recall - Validation",
                                           "Recall - Test",
                                           "Precision - Training",
                                           "Precision - Validation",
                                           "Precision - Test",
                                           "F1 - Training",
                                           "F1 - Validation",
                                           "F1 - Test"],
                                  "Random Forest - Grid Search": rfc_tuned_gcv_list,
                                  "Random Forest - Randomized Search": rfc_tuned_rcv_list,
                                  "XG Boost - Grid Search": xgb_tuned_gcv_list,
                                  "XG Boost - Randomized Search": xgb_tuned_rcv_list,
                                  "Gradient Boost - Grid Search": gbc_tuned_gcv_list,
                                  "Gradient Boost - Randomized Search": gbc_tuned_rcv_list}
                                 )
comparison_frame1
| Model | Random Forest - Grid Search | Random Forest - Randomized Search | XG Boost - Grid Search | XG Boost - Randomized Search | Gradient Boost - Grid Search | Gradient Boost - Randomized Search | |
|---|---|---|---|---|---|---|---|
| 0 | Accuracy - Training | 0.839342 | 0.839342 | 0.839342 | 0.839342 | 0.839342 | 0.839342 |
| 1 | Accuracy - Validation | 0.839092 | 0.839092 | 0.839092 | 0.839092 | 0.839092 | 0.839092 |
| 2 | Accuracy - Test | 0.839585 | 0.839585 | 0.839585 | 0.839585 | 0.839585 | 0.839585 |
| 3 | Recall - Training | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 4 | Recall - Validation | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 5 | Recall - Test | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 6 | Precision - Training | 0.839342 | 0.839342 | 0.839342 | 0.839342 | 0.839342 | 0.839342 |
| 7 | Precision - Validation | 0.839092 | 0.839092 | 0.839092 | 0.839092 | 0.839092 | 0.839092 |
| 8 | Precision - Test | 0.839585 | 0.839585 | 0.839585 | 0.839585 | 0.839585 | 0.839585 |
| 9 | F1 - Training | 0.912654 | 0.912654 | 0.912654 | 0.912654 | 0.912654 | 0.912654 |
| 10 | F1 - Validation | 0.912507 | 0.912507 | 0.912507 | 0.912507 | 0.912507 | 0.912507 |
| 11 | F1 - Test | 0.912798 | 0.912798 | 0.912798 | 0.912798 | 0.912798 | 0.912798 |
Observations:
All six tuned models produce identical metrics: recall is exactly 1.0 on every split while accuracy stays at ~0.839, which equals the share of the majority class (8500/10127). This pattern indicates the tuned models are predicting every customer as the positive class. Since the positive label (1) appears to be the majority "existing customer" class here, recall of class 1 is a misleading tuning metric. NOTE(review): verify the target encoding — attrited customers should be the positive class for this business problem.
# Visualize which features drive the tuned XGBoost model's predictions.
feature_names = X.columns
importances = xgb_tuned_rcv.feature_importances_
# ascending sort so the most important feature is drawn at the top of the chart
order = np.argsort(importances)
positions = range(len(order))
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(positions, importances[order], color="violet", align="center")
plt.yticks(positions, [feature_names[i] for i in order])
plt.xlabel("Relative Importance")
plt.show()
Observations:
Total_Trans_Ct is the most important variable, followed by Total_Ct_Chng_Q4_Q1 and Total_Trans_Amt.
Now that we have a final model, let's use pipelines to put the model into production.
# creating a list of numerical variables (raw df columns, pre-dummies)
numerical_features = [
    "CLIENTNUM",
    "Customer_Age",
    "Dependent_count",
    "Months_on_book",
    "Total_Relationship_Count",
    "Months_Inactive_12_mon",
    "Contacts_Count_12_mon",
    "Credit_Limit",
    "Total_Revolving_Bal",
    "Avg_Open_To_Buy",
    "Total_Amt_Chng_Q4_Q1",
    "Total_Trans_Amt",
    "Total_Trans_Ct",
    "Total_Ct_Chng_Q4_Q1",
    "Avg_Utilization_Ratio",
]
# creating a transformer for numerical variables, which will apply simple imputer on the numerical variables
# (Pipeline / SimpleImputer are presumably imported earlier in the notebook — verify)
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
# creating a list of categorical variables
categorical_features = [
    "Gender",
    "Education_Level",
    "Marital_Status",
    "Income_Category",
    "Card_Category",
]
# creating a transformer for categorical variables, which will first apply simple imputer and
# then do one hot encoding for categorical variables
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
# handle_unknown = "ignore", allows model to handle any unknown category in the test data
# handle_unknown = "ignore", allows model to handle any unknown category in the test data
# combining categorical transformer and numerical transformer using a column transformer
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numerical_features),
("cat", categorical_transformer, categorical_features),
],
remainder="passthrough",
)
# remainder = "passthrough" has been used, it will allow variables that are present in original data
# but not in "numerical_columns" and "categorical_columns" to pass through the column transformer without any changes
# Separating target variable and other variables (raw columns this time —
# encoding/imputation is handled inside the pipeline's preprocessor)
X = df.drop(columns="Attrition_Flag")
Y = df["Attrition_Flag"]
We already have the model to be tuned, so we will need only Training and Testing Sets.
# Splitting the data into train and test sets (70/30, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
(7088, 20) (3039, 20)
# Creating new pipeline: preprocessing followed by the XGBoost model with the
# best parameters found by the randomized search above
model = Pipeline(
    steps=[
        ("PRE", preprocessor),
        (
            "XGB",
            XGBClassifier(
                random_state=1,
                n_estimators=100,
                scale_pos_weight=2,
                subsample=0.8,
                learning_rate=0.01,
                gamma=5,
                eval_metric="logloss",
                reg_lambda=10,
                max_depth=1,
            ),
        ),
    ]
)
# Fit the model on training data
model.fit(X_train, y_train)
Pipeline(steps=[('PRE',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median'))]),
['CLIENTNUM', 'Customer_Age',
'Dependent_count',
'Months_on_book',
'Total_Relationship_Count',
'Months_Inactive_12_mon',
'Contacts_Count_12_mon',
'Credit_Limit',
'Total_Revolving_Bal',
'Avg_Open_To_Buy',
'Total_...
importance_type=None, interaction_constraints='',
learning_rate=0.01, max_delta_step=0,
max_depth=1, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100,
n_jobs=8, num_parallel_tree=1, predictor='auto',
random_state=1, reg_alpha=0, reg_lambda=10,
scale_pos_weight=2, subsample=0.8,
tree_method='exact', validate_parameters=1,
verbosity=None))])
# predict attrition labels for the held-out test set with the full pipeline
model.predict(X_test)
array([1, 1, 1, ..., 1, 1, 1])
# Module : Feature Selection, Model Selection and Tuning
# Project: thera bank credit card churners
# Submitted by : Ritesh Sharma
# Submission Date : 22 Oct 2021